Todo

  • Fix names in paper vs figures.
  • Could
    • Change CI on TOC vs Mud to Gamma, maybe others also.

Data cleaning

Code
# packages
library(tidyverse)
library(mgcv)
library(readxl)
library(ggmap)
library(janitor)
library(ggrepel)
library(GGally)
library(targets)
library(gt)
library(DataExplorer)
library(patchwork)
library(ggh4x)
library(ggthemes)
library(glmmTMB)
library(DHARMa)
library(performance)
library(MASS)
library(scales)
library(viridis)
library(gtsummary)
library(ggeffects)
library(multcomp)
library(broom)
library(broom.mixed)
library(parameters)
select <- dplyr::select
set.seed(12345)

# set theme
survey_theme <- theme_few()
theme_set(survey_theme)

source("pfas_scripts.R")


## for renaming pfas compounds with inconsequential naming
pfas_renamer <- function(pfas_in) {
  str_replace_all(
    pfas_in,
    c(
      "_a" = "_da",
      "pf_dc_da" = "PFDA",
      "pf_tri_da" = "PFTrDA",
      "pfos$" = "PFOS",
      "pfda" = "PFDA",
      "pfna" = "PFNA",
      "pf_hx_da" = "PFHxDA",
      "pf_hp_da" = "PFHpDA",
      "pfoa" = "PFOA",
      "pf_un_da" = "PFUnDA",
      "pf_do_da" = "PFDoDA",
      "pf_tr_da" = "PFTrDA",
      "s9_pfas" = "Σ9PFAS",
      "sum_9_pfas" = "Σ9PFAS"
    )
  )
}

## for renaming sample ids to fix inconsistencies
mareano_renamer <- function(ss_in) {
  
  sub_renamer <- function(ss_in){
  # check input
  stopifnot("requires single character string input." = length(ss_in) == 1 && is.character(ss_in))

  # for all strings remove spaces and replace BC with BX
  ss_out <- ss_in %>%
    str_replace_all("\\s", "") %>%
    str_replace_all("BC", "BX")
  # return if string do not contain MC or BX, as errors only with BX/MC
  if (str_detect(ss_out, "MC|BX", negate = TRUE)) {
    return(ss_out)}
  
  # part of name including BX/MC
  ss_prefix <- str_extract(ss_out, ".+[BXMC]+")
  # suffix-numbers with incoherent lengths
  ss_suffix <- str_extract(ss_out, "[0-9]+$")

  # change lengths
  if (str_length(ss_suffix) == 3) {
    return(ss_out)
  } else if (str_length(ss_suffix) == 2) {
    return(paste0(ss_prefix, "0", ss_suffix))
  } else if (str_length(ss_suffix) == 1) {
    return(paste0(ss_prefix, "00", ss_suffix))
  } else {
    # break with error message "suffix string not of length 1, 2 or 3":
    stop("suffix string not of length 1, 2 or 3")
  }
  }
  
  ss_out <-  map_chr(ss_in, sub_renamer)
  return(ss_out)
}

# to fix shifted column names
newnames <- (read_excel("data/Hele datasettet v2_corr.xlsx", skip = 178) %>% clean_names())[1, 16:29] %>% as_vector()

# appending correct column names
dataset <- read_excel("data/Hele datasettet v2_corr.xlsx", skip = 178) %>%
  clean_names() %>%
  rename_with(.cols = x16:x29, ~newnames) %>%
  clean_names() %>%
  slice(-1) %>%
  select(
    sample_id = sample,
    year = cruise_3,
    number = cruise_4,
    from_cm_o = sample_interval_top_bottom_5,
    to_cm_o = x6,
    longitude = dde,
    latitude = ddn,
    depth = mbsl,
    sample_id2 = x1096mc002,
    sample_id3 = sample_2,
    from_cm_i = sample_interval_top_bottom_12,
    to_cm_i = x13,
    clay_fraction,
    silt_fraction,
    Mud = mud_percent,
    TOC = toc_percent,
    pfos:pf_tri_a,
    s9_pfas = sum_9_pfas,
  ) %>%
  filter(
    !is.na(year)
  ) %>%
  mutate(across(!matches("sample"), as.numeric)) %>%
  # assign Areas:
  mutate(
    Area = case_when(
      row_number() %in% c(1:18) ~ "I",
      row_number() %in% c(19:45) ~ "II",
      row_number() %in% c(46:66) ~ "III",
      row_number() %in% c(67:89) ~ "IV",
      row_number() %in% c(90:95) ~ "V",
      TRUE ~ "Other"
    ),
    Area = as.factor(Area)
  ) %>%
  pivot_longer(
    cols = pfos:s9_pfas, names_to = "pfas", values_to = "pfas_value"
  ) %>%
  mutate(sample_id3 = mareano_renamer(sample_id3),
         pfas = pfas_renamer(pfas)) %>% 
  mutate(depth = case_when(
    sample_id3 == "R3165BX009" ~ 166,
    sample_id3 == "R3184BX013" ~ 277,
    sample_id3 == "R3105BX004" ~ 374,
    sample_id3 == "R3032BX001" ~ 419,
    sample_id3 == "R3004BX004" ~ 416,
    sample_id3 == "R2969BX003" ~ 855,
    sample_id3 == "R2924BX002" ~ 866,
    sample_id3 == "R1843BX038" ~ 277,
    TRUE ~ depth
),
depth = abs(depth) %>% round(digits = 0))

# import curated LOQs
loqs_raw <- read_excel("data/LOQs.xlsx") %>%
  clean_names() %>%
  select(-enhet) %>%
  rename(pfas = forbindelse)

loqs_years <- loqs_raw %>%
  filter(!(is.na(pfas))) %>%
  mutate(across(-pfas, ~ str_replace_all(.x, "\\<", "") %>% as.numeric()),
    pfas = make_clean_names(pfas)
  ) %>%
  rename_with(.cols = everything(), ~ str_replace(.x, "_2", "A") %>% str_replace("_3", "B")) %>%
  pivot_longer(cols = matches("\\d{4}"), names_to = "analysis_year", values_to = "loq")

loqs_samples <- loqs_raw %>%
  filter(is.na(pfas)) %>%
  rename_with(.cols = everything(), ~ str_replace(.x, "_2", "A") %>% str_replace("_3", "B")) %>%
  select(-pfas) %>%
  pivot_longer(
    cols = matches("\\d{4}"),
    names_to = "analysis_year",
    values_to = "sample_id"
  ) %>%
  drop_na() %>% 
  mutate(
    sample_id = mareano_renamer(sample_id),
    #correct sample_id error
    sample_id = if_else(sample_id == "R2924BX001", "R2924BX002", sample_id )
  )

loqs <- left_join(
  loqs_years, loqs_samples,
  by = c("analysis_year")
) %>% 
  mutate(loq = if_else(sample_id == "R3032BX001" & pfas == "pf_hp_a", 0.05, loq),
         pfas = pfas_renamer(pfas))

# merge data, assign values below LOQ
pfas_raw <- left_join(dataset, loqs, by = c("pfas", "sample_id3" = "sample_id")) %>%
  select(-analysis_year) %>%
  select(sample_id3, year, Area, longitude, latitude, Mud, TOC, pfas, pfas_value, loq, depth) %>%
  mutate(
    # to avoid floating point problems using ><=
    across(c("loq", "pfas_value"), ~ .x %>% round(digits = 6)),
    Area = as_factor(Area)
  )
  
pfas_data_eq0 <- pfas_raw %>%
  group_by(pfas) %>%
mutate(max_loq = max(loq %>% replace_na(0), na.rm = TRUE)) %>% 
  mutate(
    pfas_value = case_when(
      pfas_value < max_loq ~ 0,
      pfas_value %>% is.na ~ 0,
      TRUE ~ pfas_value
    )
  ) %>%
  ungroup() %>%
  group_by(sample_id3) %>%
  select(-max_loq) %>%
  mutate(pfas = factor(pfas, levels = c("PFHxDA", "PFHpDA", "PFOA", "PFNA", "PFDA", "PFUnDA", "PFDoDA", "PFTrDA", "PFOS", "Σ9PFAS"))) %>%
    mutate(
    pfas_value = if_else(
      pfas == "Σ9PFAS",
      sum(pfas_value[pfas != "Σ9PFAS"], na.rm = TRUE),
      pfas_value
    )
  )

pfas_data_eq0_wide <- pivot_wider(pfas_data_eq0 %>% select(-c(loq)), names_from = pfas, values_from = pfas_value) %>% ungroup()

## Using individual sample LOQs
pfas_data_0 <- pfas_raw %>%
  group_by(pfas) %>%
  mutate(max_loq = max(loq %>% replace_na(0), na.rm = TRUE)) %>% 
  mutate(
    pfas_value = case_when(
      pfas_value < loq ~ 0,
      pfas_value %>% is.na ~ 0,
      TRUE ~ pfas_value
    )
  ) %>%
  ungroup() %>%
  group_by(sample_id3) %>%
  mutate(pfas = factor(pfas, levels = c("PFHxDA", "PFHpDA", "PFOA", "PFNA", "PFDA", "PFUnDA", "PFDoDA", "PFTrDA", "PFOS", "Σ9PFAS"))) %>%
    mutate(
    pfas_value = if_else(
      pfas == "Σ9PFAS",
      sum(pfas_value[pfas != "Σ9PFAS"], na.rm = TRUE),
      pfas_value
    )
  ) %>% 
  mutate(
    pfas_value = if_else(pfas_value >= loq & pfas_value < max_loq, paste0(format(round(pfas_value, digits = 2), nsmall = 2), "*"), paste0(format(round(pfas_value, digits = 2), nsmall = 2)))
  )
  
pfas_data_0_wide <- pivot_wider(pfas_data_0 %>% select(-c(loq, max_loq)), names_from = pfas, values_from = pfas_value) %>% ungroup()

The LOQs varied between years; therefore, to avoid the potential of LOQ serving as a confounding variable, the highest value observed was uniformly applied. Values falling below the LOQ were assigned a zero, yielding conservative estimates of concentrations, and in comparison to assigning LOQ/2, resulting in a marginal rise in the coefficient estimates associated with mud.

Modeling

Fit

A generalized linear model with Poisson-gamma distributed errors and a logarithmic link function was employed to analyze the zero-inflated data censored by the limits of quantification. This model incorporated the predictors Area, TOC, and Mud and was implemented using the ‘glmmTMB’ package in R. Data exploration showed Mud to be collinear with TOC, and stepwise model selection based on Akaike’s information criteria showed Mud to have higher predictive value than TOC. Thus, Area and Mud were retained for the regression analysis. To test for differences between the Areas, pairwise comparisons were performed using the ‘multcomp’ package with single-step p-value adjustment. Simulation-based model validation was carried out using the ‘DHARMa’ and ‘ggeffects’ packages. The code to reproduce the statistical analysis is provided in an online repository.

Code
pfas_glm <- function(in_dat){
  glmmTMB(data = in_dat, formula = pfas_value ~ Area + Mud, family = tweedie(link = "log"))
}

pfas_w_models <-
  pfas_data_eq0 %>%
  filter((pfas %in% c("PFOS", "PFOA", "PFNA", "PFUnDA", "Σ9PFAS"))) %>%
  ## inspect effect of excluding ares with high PFAS concentrations and high Mud content
  # filter(!(Area %in% c("I", "V"))) %>%
  mutate(
    pfas = fct_relevel(pfas, c("Σ9PFAS", "PFOS", "PFOA", "PFNA", "PFUnDA")),
    pfas_label = pfas
  ) %>%
  arrange(pfas) %>%
  group_by(pfas) %>%
  nest() %>%
  mutate(
    m_fit = map(data, pfas_glm),
    tidy = map(m_fit, ~ broom.mixed::tidy(.x, exponentiate = TRUE)),
    glance = map(m_fit, broom.mixed::glance),
    augment = map2(m_fit, data, ~ broom.mixed::augment(.x, .y)),
    pred = map(m_fit, ~ ggpredict(.x, terms = c("Mud [1:100]", "Area"))),
    simulated = map(m_fit, ~ simulate(.x, nsim = 25, seed = 12345)),
    sim_data = map2(
      simulated, data, ~ bind_cols(.x, .y) %>%
        pivot_longer(cols = matches("sim"), names_to = "simgroup", values_to = "sim") %>%
        rename(group = Area)
    ),
    Area_model = map2(m_fit, pfas, ~ glht(.x, linfct = mcp(Area = "Tukey"))),
    # # validate the parameters:: implementation
    # Area_hypothesis = map2(m_fit, pfas, ~ glht(.x, linfct = mcp(Area = "Tukey")) %>%
    #   summary(test = adjusted(type = "single-step")) %>%
    #   tidy() %>%
    #   select(-c(term, null.value)) %>%
    #   mutate(estimate = exp(estimate))),
    reg_table = map(m_fit, ~ tbl_regression(.x, exponentiate = TRUE, tidy_fun = broom.mixed::tidy), estimate_fun = purrr::partial(style_ratio, digits = 3))
  )

Model validation

Details and model validation
Code
sim_data <- pfas_w_models %>% 
  unnest(sim_data)

pred_data <- pfas_w_models %>% unnest(pred)

pred_data %>% ggplot(aes(x, predicted)) +
  geom_smooth(color = "#AF2D2D") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  geom_point(data = sim_data, aes(x = Mud, y = sim), alpha = 0.1, size = 0.2, color = muted("red", l = 50, c = 70))+
  geom_point(aes(x = Mud, y = pfas_value),
             data = sim_data %>% distinct(sample_id3, .keep_all = TRUE)) +
  facet_grid(pfas~group, scales = "free")+
  xlab("Mud [%]")+
  ylab("PFAS [\u03BCg/kg]")

Shown are the model fits on the original scale, with observations shown by black dots. Small red dots shows 25 simulations from the model for each combination of observed covariate values.

Code
# dharma_printer <- function(mod, in_dat) {
#   sim_resid <- simulateResiduals(fittedModel = mod, n = 1000)
#   pfas_name <- in_dat$pfas_label[1]
#   ## for naming plots
#   #   plot.new()
#   # mtext(paste(pfas_name), cex = 4)
#   testZeroInflation(sim_resid)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotQQunif(sim_resid)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(sim_resid)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+"), " (T-P=", glmmTMB:::.tweedie_power(mod) %>% round(digits = 3), ", d=", sigma(mod) %>% round(digits = 3), ")"), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(mod, form = in_dat$Area)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(mod, form = in_dat$TOC)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(sim_resid, form = in_dat$year)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(sim_resid, form = in_dat$latitude)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# 
#   plotResiduals(sim_resid, form = in_dat$longitude)
#   mtext(paste0(pfas_name, "~", str_flatten(mod$frame %>% select(-1) %>% colnames(), collapse = "+")), side = 4, col = 2, cex = 1.5)
# }
# 
# map2(pfas_w_models$m_fit, pfas_w_models$data, ~ dharma_printer(.x, .y))

Plots above show scaled quantile residuals using the DHARMa package. These are obtained by simulating from the fitted model and comparing to the observed residuals. The DHaRMa residuals are expected to be uniformly distributed around 0, red lines parallel and near the dotted grey lines. The quantile-quantile plot are expected to follow the 45-degree line. Plots are grouped by PFAS and Area.

Comments:

Results

Overview and summary

Code
map_Area_conc <-
  ggmap(tar_read(stadia_map)) +
  geom_point(aes(x = longitude, y = latitude, color = Σ9PFAS, fill = Σ9PFAS, shape = Area), data = pfas_data_eq0_wide, position = position_dodge(width = 10), size = 2, alpha = 0.7) +
  labs(x = "Longitude [°E]", y = "Latitude [°N]") +
  scale_color_viridis(option = "magma") +
  scale_fill_viridis(option = "magma") +
  scale_shape_manual(values = c(21, 22, 23, 24, 25)) +
  #change legend label to Area and \u03A39PFAS:
  labs(color = "\u03A39PFAS", shape = "Area")+
  guides(fill = FALSE)

map_Area_conc

Code
## year
  # ggmap(tar_read(stadia_map)) +
  # geom_point(aes(x = longitude, y = latitude, color = year, fill = year, shape = Area), data = pfas_data_eq0_wide, position = position_dodge(width = 10), size = 2, alpha = 0.7) +
  # labs(x = "Longitude [°E]", y = "Latitude [°N]") +
  # scale_color_viridis(option = "magma") +
  # scale_fill_viridis(option = "magma") +
  # scale_shape_manual(values = c(21, 22, 23, 24, 25)) +
  # #change legend label to Area and \u03A39PFAS:
  # labs(color = "Year", shape = "Area")+
  # guides(fill = FALSE)

## depth
  # ggmap(tar_read(stadia_map)) +
  # geom_point(aes(x = longitude, y = latitude, color = depth, fill = depth, shape = Area), data = pfas_data_eq0_wide %>% filter(depth < 200), position = position_dodge(width = 10), size = 2, alpha = 0.7) +
  # labs(x = "Longitude [°E]", y = "Latitude [°N]") +
  # scale_color_viridis(option = "magma") +
  # scale_fill_viridis(option = "magma") +
  # scale_shape_manual(values = c(21, 22, 23, 24, 25)) +
  # #change legend label to Area and \u03A39PFAS:
  # labs(color = "Depth [m]", shape = "Area")+
  # guides(fill = FALSE)
Code
pfas_data_eq0_wide %>%
  group_by(Area) %>%
  # mean, sd for each Area for each of Σ9PFAS, TOC, Mud:
  summarise(
    across(Mud:Σ9PFAS, mean, na.rm = TRUE),
    n = n()
  ) %>%
  gt() %>%
  fmt_number(columns = 2:13, decimals = 1) %>%
  tab_spanner(label = "Mean", columns = 2:13) %>%
  tab_header(title = "Summary of covariates by Area, <LOQ = 0") %>%
  cols_label(
    Area = md("**Area**"),
    n = md("**n**"),
    Σ9PFAS = md("**\u03A39PFAS**"),
    PFHxDA = md("**PFHxDA**"),
    PFHpDA = md("**PFHpdA**"),
    PFDA = md("**PFDA**"),
    PFOS = md("**PFOS**"),
    PFOA = md("**PFOA**"),
    PFNA = md("**PFNA**"),
    PFTrDA = md("**PFTrDA**"),
    PFDoDA = md("**PFDoDA**"),
    PFUnDA = md("**PFUnDA**"),
    TOC = md("**TOC**"),
    Mud = md("**Mud**")
  )
Summary of covariates by Area, <LOQ = 0
Area Mean Σ9PFAS n
Mud TOC depth PFOS PFHxDA PFHpdA PFOA PFNA PFDA PFUnDA PFDoDA PFTrDA
I 88.9 1.4 397.0 0.3 0.0 0.1 0.4 0.3 0.0 0.1 0.0 0.0 1.2155233 18
II 65.9 1.6 386.0 0.2 0.0 0.0 0.4 0.2 0.0 0.0 0.0 0.0 0.8274597 27
III 48.7 0.9 373.3 0.3 0.0 0.0 0.3 0.2 0.0 0.1 0.0 0.0 0.7867911 21
IV 75.7 1.1 474.8 0.2 0.0 0.0 0.3 0.2 0.0 0.0 0.0 0.0 0.7719000 23
V 93.4 2.6 336.0 0.8 0.0 0.1 0.4 0.4 0.1 0.3 0.0 0.1 2.1211085 6
Code
pfas_ratios <- pfas_data_eq0_wide %>%
  mutate(
    "PFOA_to_PFNA" = if_else(PFOA == 0 | PFNA == 0, NA, PFOA / PFNA),
    "PFDA_to_PFUnDA" = if_else(PFDA == 0 | PFUnDA == 0, NA, PFDA / PFUnDA)
  ) %>%
  select("PFOA_to_PFNA", "PFDA_to_PFUnDA", PFOA, PFNA, PFDA, PFUnDA, sample_id3:TOC) %>%
  pivot_longer(
    cols = c("PFOA_to_PFNA", "PFDA_to_PFUnDA"),
    names_to = "pfas",
    values_to = "Ratio"
  ) %>% mutate(c1 = if_else(pfas == "PFOA_to_PFNA", PFOA, PFDA),
               c2 = if_else(pfas == "PFOA_to_PFNA", PFNA, PFUnDA)
  ) %>% 
  mutate(pfas = case_when(
      pfas == "PFOA_to_PFNA" ~ "PFOA : PFNA",
      pfas == "PFDA_to_PFUnDA" ~ "PFDA : PFUnDA"
    )
  )
  
  pfas_ratios %>% 
  ggplot(aes(Area, Ratio)) +
  geom_jitter(aes(fill = Area), width = 0.2, size = 3, colour = "black", shape = 21) +
  # mean for each Area using stat summary:
  stat_summary(aes(group = 1),
    fun = mean, geom = "line",
    linetype = "dotted",
    size = 0.5, fill = "black"
  ) +
    scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  #   scale_color_viridis(option = "magma") +
  # scale_fill_viridis(option = "magma")+
  stat_summary(geom = "errorbar", fun.data = mean_cl_normal, fun.args = list(conf.int = 0.95), size = 0.7, width = 0.5) +
  xlab("Area") +
  guides(fill = FALSE) +
  facet_wrap(~pfas, scales = "free")

Code
  pfas_ratios %>% group_by(Area, pfas) %>% summarise( mean =  mean(Ratio, na.rm = T)) %>% 
    mutate(pfas = case_when(
      pfas == "PFOA_to_PFNA" ~ "PFOA : PFNA",
      pfas == "PFDA_to_PFUnDA" ~ "PFDA : PFUnDA"
    )) %>% 
    pivot_wider(
      names_from = Area,
      values_from = mean) %>% 
    gt() %>% 
    fmt_number(2:6, decimals = 1)
pfas I II III IV V
NA 0.5932791, 1.6218117 0.5161325, 1.6982291 0.5800105, 1.5953601 0.6052262, 1.7720249 0.5843074, 1.0098973

Outlier check

Code
outlier_cand <- pfas_data_eq0_wide %>%
    mutate(
        PFOA_to_PFNA = if_else(PFOA == 0 | PFNA == 0, NA, PFOA / PFNA),
        PFDA_to_PFUnDA = if_else(PFDA == 0 | PFUnDA == 0, NA, PFDA / PFUnDA)
    ) %>%
  filter(PFOA_to_PFNA > 3 | PFDA_to_PFUnDA < 0.3)

outlier_cand %>% gt() %>% fmt_number(columns = longitude:PFDA_to_PFUnDA)
sample_id3 year Area longitude latitude Mud TOC depth PFOS PFHxDA PFHpDA PFOA PFNA PFDA PFUnDA PFDoDA PFTrDA Σ9PFAS PFOA_to_PFNA PFDA_to_PFUnDA
R1582BX007 2016 I 28.33 78.15 89.60 1.28 307.00 0.21 0.00 0.00 0.47 0.13 0.00 0.00 0.00 0.00 0.80 3.69 NA
R0739MC019 2011 IV 11.16 67.80 55.50 0.54 264.00 0.00 0.00 0.00 0.30 0.10 0.00 0.00 0.00 0.00 0.40 3.12 NA
R1565MC097 2015 IV 8.24 66.56 61.00 0.50 330.00 0.00 0.00 0.00 0.42 0.12 0.00 0.00 0.00 0.00 0.54 3.50 NA
P2002009 2020 V 5.77 58.94 84.20 5.50 240.00 0.64 0.00 0.00 0.17 0.00 0.12 0.67 0.29 0.38 2.27 NA 0.17
Code
library(leaflet)
library(leafem)


leaflet(outlier_cand) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~longitude,
                   lat = ~latitude,
                   radius = 2,
                   popup = ~ paste(
                     "PFDA_to_PFUnDA :",
                     PFDA_to_PFUnDA,
                     "<br>",
                     "PFOA_to_PFNA :",
                     PFOA_to_PFNA,
                     "<br>",
                     "sample_id :",
                     sample_id3)) %>% 
  addLabelOnlyMarkers(
   lng = ~longitude,
                   lat = ~latitude,
                   label = ~ paste(
                     "PFDA_to_PFUnDA :",
                     PFDA_to_PFUnDA %>% round(digits = 1),
                     "//",
                     "PFOA_to_PFNA :",
                     PFOA_to_PFNA %>% round(digits = 1),
                     "//",
                     "sample_id :",
                     sample_id3),
    # labelOptions = labelOptions(noHide = T))
    labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T)
  )
Code
pfas_prop <- pfas_data_eq0 %>%
  # for each Area, calculate the proportion of each pfas:
  group_by(Area, pfas) %>%
  summarise(
    pfas_sum = sum(pfas_value, na.rm = TRUE),
  ) %>%
  group_by(Area) %>%
  mutate(
    pfas_prop = pfas_sum / sum(pfas_sum[pfas != "Σ9PFAS"], na.rm = TRUE)
  ) %>%
  filter(pfas != "Σ9PFAS")

ggplot(pfas_prop, aes(fill = pfas, y = pfas_prop, x = Area)) +
  geom_bar(position = "stack", stat = "identity", width = 0.88, color = "black") +
  scale_fill_brewer(palette = "Set3") +
  xlab("Area") +
  ylab("Propotion") +
  # legend title PFAS:
  labs(fill = "")

PFAS vs Mud by Area

Code
pred_data %>%
  filter(pfas == "Σ9PFAS") %>%
  ggplot(aes(x, predicted)) +
  geom_point(aes(x = Mud, y = pfas_value, fill = group), color = "black", shape = 21,
    data = sim_data %>% filter(pfas == "Σ9PFAS") %>% distinct(sample_id3, .keep_all = TRUE)
  ) + 
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
    geom_smooth(color = "#AF2D2D") +
  scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  facet_wrap(~ group) +
  guides(color = FALSE, size = FALSE, fill = FALSE)+
  xlab("Mud [%]") +
  ylab("\u03A39PFAS [\u03BCg/kg]")

Code
pred_data %>%
  filter(pfas != "Σ9PFAS") %>%
  ggplot(aes(x, predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  geom_point(aes(x = Mud, y = pfas_value, fill = group), color = "black", shape = 21,
    data = sim_data %>% filter(pfas != "Σ9PFAS") %>% distinct(sample_id3, .keep_all = TRUE)
  ) +
  geom_smooth(color = "#AF2D2D") +
    scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  facet_grid(pfas ~ group, scales = "free") +
  guides(fill = FALSE)+
  xlab("Mud [%]") +
  ylab("[\u03BCg/kg]")

PCA

Code
library(factoextra)
library(FactoMineR)
# apply PCA
pca <- pfas_data_eq0_wide %>% select(Mud:"Σ9PFAS") %>%
  select(-"Σ9PFAS", 
         -"PFDoDA",
         -"PFTrDA",
         -depth
         ) %>%
  prcomp(center = TRUE, scale. = TRUE)

Area_pcatag <- pfas_data_eq0_wide %>% select(Area)



pca_Area <- fviz_pca_biplot(pca,
                axes = c(1,2),
                repel = TRUE,
                fill.ind = Area_pcatag$Area,
                pointshape = 21,
                # col.ind = Area_pcatag,
                  palette = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"),
                pointsize = 4,
                addEllipses = FALSE,
                col.var = "black",
                title = "",
                mean.point = FALSE,
                # ellipse.type = "confidence",
                legend.title = "Area",
                labelsize = 4,
                label = "var",
                invisible = "none",
                ggtheme = survey_theme)#"ind")

pca_Area

Code
ggsave("pca-pfas-eq0.pdf")

Effect of Mud

Code
pfas_w_models %>%
  select(pfas, m_fit) %>%
  mutate(
    regression_table = map(m_fit, ~ model_parameters(.x, exponentiate = TRUE) %>% add_row(Parameter = "(Tweedie power)", Coefficient = glmmTMB:::.tweedie_power(.x)))
    # tweedie_power = map(m_fit, ~ glmmTMB:::.tweedie_power(.x))
  ) %>%
  unnest(regression_table) %>%
  mutate(
    Parameter = if_else(Component == "dispersion" & Parameter == "(Intercept)", "(Tweedie dispersion)", Parameter)
  ) %>%
  select(pfas, Parameter, Estimate = Coefficient, CI_low, CI_high, p) %>% 
  filter(Parameter == "Mud") %>% 
  ggplot(aes(x = Estimate, y = pfas)) + geom_point() + geom_errorbarh(aes(xmin = CI_low, xmax = CI_high)) + geom_vline(xintercept = 1, linetype = "dashed", color = "#AF2D2D") + labs(x = "Exponentiated coefficient of Mud", y = "PFAS") +
  scale_x_continuous(breaks = c(1, 1.01, 1.02, 1.03, 1.04, 1.05))+
  scale_y_discrete(limits=rev)
Figure 1: Relative contribution of Mud to the concentration of different PFASes. 1.01 corresponds to 1 % increase of PFAS concentration per % increase in Mud.

Area hypothesis

Code
Areas_table <- pfas_w_models %>%
  select(pfas, Area_model) %>%
  mutate(
    Area_hypothesis = map(Area_model, ~ model_parameters(.x, exponentiate = TRUE))
  ) %>%
  unnest(Area_hypothesis) %>%
  select(-c(Area_model, SE, CI, z, df_error))

Areas_table %>%
  rename("exp(estimate)" = Coefficient) %>%
  gt(groupname_col = "pfas") %>%
  fmt_number(columns = 3:6, decimals = 2) %>% 
  tab_style(style = list( 
                         cell_text(weight = 'bold')), 
            locations = cells_column_labels(columns=everything())
  ) %>% 
  tab_style(
  style = cell_text(weight = "bold"),
  locations = cells_row_groups(groups = everything())
) %>% 
   tab_style(style = list( 
                         cell_text(weight = 'bold')), 
            locations = cells_body(columns=everything(), 
                                   rows = p <= 0.05)
  )
Parameter exp(estimate) CI_low CI_high p
Σ9PFAS
II - I == 0 0.88 0.41 1.87 0.99
III - I == 0 0.95 0.40 2.24 1.00
IV - I == 0 0.73 0.35 1.56 0.79
V - I == 0 1.69 0.69 4.13 0.49
III - II == 0 1.08 0.51 2.25 1.00
IV - II == 0 0.83 0.41 1.71 0.96
V - II == 0 1.92 0.76 4.85 0.30
IV - III == 0 0.77 0.35 1.73 0.91
V - III == 0 1.79 0.65 4.94 0.52
V - IV == 0 2.30 0.92 5.79 0.10
PFOS
II - I == 0 0.90 0.40 2.02 1.00
III - I == 0 1.42 0.60 3.36 0.80
IV - I == 0 0.93 0.42 2.05 1.00
V - I == 0 2.40 1.05 5.51 0.03
III - II == 0 1.59 0.73 3.44 0.48
IV - II == 0 1.04 0.48 2.25 1.00
V - II == 0 2.68 1.12 6.43 0.02
IV - III == 0 0.65 0.29 1.45 0.60
V - III == 0 1.69 0.67 4.25 0.53
V - IV == 0 2.58 1.10 6.03 0.02
PFOA
II - I == 0 0.86 0.42 1.78 0.98
III - I == 0 0.67 0.27 1.63 0.73
IV - I == 0 0.62 0.29 1.33 0.42
V - I == 0 0.89 0.31 2.49 1.00
III - II == 0 0.77 0.36 1.66 0.89
IV - II == 0 0.72 0.35 1.48 0.71
V - II == 0 1.03 0.36 2.96 1.00
IV - III == 0 0.92 0.39 2.18 1.00
V - III == 0 1.33 0.40 4.35 0.97
V - IV == 0 1.43 0.49 4.24 0.89
PFNA
II - I == 0 0.80 0.39 1.67 0.92
III - I == 0 0.75 0.31 1.82 0.90
IV - I == 0 0.65 0.30 1.37 0.50
V - I == 0 1.09 0.43 2.73 1.00
III - II == 0 0.94 0.42 2.08 1.00
IV - II == 0 0.80 0.38 1.69 0.93
V - II == 0 1.36 0.51 3.58 0.91
IV - III == 0 0.86 0.36 2.03 0.99
V - III == 0 1.45 0.49 4.31 0.89
V - IV == 0 1.68 0.63 4.50 0.59
PFUnDA
II - I == 0 0.53 0.07 4.06 0.91
III - I == 0 1.96 0.34 11.25 0.83
IV - I == 0 0.96 0.17 5.33 1.00
V - I == 0 4.53 0.99 20.74 0.05
III - II == 0 3.72 0.55 24.96 0.33
IV - II == 0 1.83 0.25 13.29 0.92
V - II == 0 8.60 1.24 59.75 0.02
IV - III == 0 0.49 0.10 2.54 0.76
V - III == 0 2.32 0.45 11.79 0.62
V - IV == 0 4.70 0.97 22.77 0.06
  • For Σ9PFAS, no p-values are significant, indicating that no clear differences between Areas are observed.
  • However, for individual compounds some differences are observed:
    • For PFOS, there are differences between Area V versus I, II and IV.
    • For PFUnDA, there are differences between V versus I, II and IV.
  • Overall, differences are driven by Areas I and V. This is further shown by residual plots below.
Code
model_preds <-
  map2(pfas_w_models$m_fit, pfas_w_models$pfas, ~ {
    mud_percent <- 71
    predict_response(.x, terms = "Area", condition = c("Mud" = mud_percent)) %>% plot() + ylab("Predicted") + theme_bw(base_size = 12) + labs(title = NULL, subtitle = paste(.y, " | Mud% = ", mud_percent))
    # ggsave(paste0(.y, "_", mud_percent, ".pdf"))
  })
wrap_plots(model_preds)

Latitude hypothesis

Code
# residuals vs latitude without Area categories
pfas_glm <- function(in_dat){
  glmmTMB(data = in_dat, formula = pfas_value ~  Mud, family = tweedie(link = "log"))
}

Area_wmodels <-
  pfas_data_eq0 %>%
  filter(!(pfas %in% c("PFHxDA", "PFDoDA", "PFTrDA"))) %>%
  ## inspect effect of excluding ares with high PFAS concentrations and high Mud content
  # filter(!(Area %in% c("I", "V"))) %>%
  mutate(
    # pfas = fct_relevel(pfas, c("Σ9PFAS", "PFOS", "PFOA", "PFNA", "PFUnDA")),
    pfas_label = pfas
  ) %>%
  arrange(pfas) %>%
  group_by(pfas) %>%
  nest() %>%
  mutate(
    m_fit = map(data, pfas_glm),
    tidy = map(m_fit, ~ broom.mixed::tidy(.x, exponentiate = TRUE)),
    glance = map(m_fit, broom.mixed::glance),
    augment = map2(m_fit, data, ~ broom.mixed::augment(.x, .y)),
    pred = map(m_fit, ~ ggpredict(.x, terms = c("Mud [1:100]"), back_transform = TRUE)),
    simulated = map(m_fit, ~ simulate(.x, nsim = 25, seed = 12345)),
    sim_data = map2(
      simulated, data, ~ bind_cols(.x, .y) %>%
        pivot_longer(cols = matches("sim"), names_to = "simgroup", values_to = "sim")
    ),
    reg_table = map(m_fit, ~ tbl_regression(.x, exponentiate = TRUE, tidy_fun = broom.mixed::tidy), estimate_fun = purrr::partial(style_ratio, digits = 3))
  )


Area_wmodels %>%
  unnest(augment) %>%
  ggplot(aes(x = latitude, y = .resid)) +
  geom_point(aes(fill = Area), shape = 21, color = "black") +
  geom_smooth(color = "#AF2D2D") +
  scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c")) +
  facet_wrap(~pfas, scale = "free")+
  scale_x_reverse() +
  ylab("Residuals")+
  xlab("Latitude")

# test colours:
# testcol <- expand_grid(my_l = seq(0,100, 10), my_c= seq(0,100, 10)) %>% 
#   # mutate(color = muted("red", l = my_l, c = my_c))
#   mutate(hei = map2(my_l, my_c, ~muted("blue", .x, .y))
#   )
# show_col(testcol$hei %>% as_vector())
Figure 2: Residuals of the model for PFAS concentration vs Mud plotted against latitude. The red line is the fitted loess curve. The residuals are not independent of latitude, indicating that the model does not capture all the variation in the data. We see that this is driven primarily by Areas I and V.
Code
# Area_wmodels %>%
#   unnest(augment) %>%
#   ggplot(aes(x = Area, y = .resid)) +
#   geom_boxplot()+
#   geom_jitter()+
#   facet_wrap(~pfas, scale =)+
#   xlab("Area")
Code
# latitude coefficient hypothesis
tbl_merge(
  (Area_wmodels %>% 
    filter(pfas %in% c("Σ9PFAS", "PFOS", "PFOA", "PFNA", "PFUnDA", "PFDA","PFHpDA")))$reg_table,
    tab_spanner =  Area_wmodels %>% pull(pfas) %>% as.character()
    ) %>% as_gt()
Characteristic PFHpDA PFOA PFNA PFDA PFUnDA PFOS Σ9PFAS
exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value
Mud 1.03 1.00, 1.06 0.024 1.01 1.00, 1.01 0.10 1.01 1.01, 1.02 <0.001 1.03 1.01, 1.06 0.009 1.02 1.00, 1.04 0.017 1.02 1.01, 1.02 <0.001 1.01 1.01, 1.02 <0.001
1 CI = Confidence Interval

Applying the model with covariates ‘C{PFAS} = Mud + latitude’, yields the significance of covariates shown in table above. Latitude is not a significant covariate for Σ9PFAS. Some trends in the individual PFASes point toward lower concentrations at higher latitudes for PFOS, PFUnA, and PFDcA, whereas PFOA may increase going northward. However, as shown in Figure 2, Areas I and V exert leverage on the data and local hotspots serve as hotspots driving much of the observed spatial variance. Thus, differences (and lack of thereof) may be driven largely by locations I and V. We also see that the variation is bigger for Areas I and V, and correspondingly, for low and high latitudes. This makes inference on the background levels difficult.

Depth and time hypothesis

Code
# residuals vs depth with Area categories

# pfas_w_models %>%
#   unnest(augment) %>% 
#   ggplot(aes(x = depth, y = .resid)) +
#   geom_point(aes(fill = Area), shape = 21, color = "black") +
#   geom_smooth(color = "#AF2D2D") +
#   scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c")) +
#   facet_wrap(~pfas, scale = "free")+
#   ylab("Residuals")+
#   xlab("Depth [m]")


Area_wmodels %>%
  unnest(augment) %>% 
  ggplot(aes(depth, pfas_value))+
  geom_point(aes(fill = Area), shape = 21, color = "black") +
  facet_wrap(~pfas, scale = "free")+
   scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  ylab("Concentration [µg/kg dry weight]")+
  xlab("Depth [m]")
ggsave("depth_vs_conc.png", width = 7, height = 4.5)


Area_wmodels %>%
  filter(pfas == "PFOA") %>% 
  unnest(augment) %>%
  ggplot(aes(Area, depth))+
  geom_jitter(aes(fill = Area), shape = 21, color = "black", width = 0.15) +
  geom_boxplot(alpha = 0.5, outliers = FALSE)+
  # facet_wrap(~pfas, scale = "free")+
   scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  ylab("Depth [m]")+
  xlab("Area")+
  theme(legend.position = "none")
ggsave("depth_vs_area.png", width = 7, height = 4.5)
Figure 3: Residuals of the model for PFAS concentration vs Mud plotted against depth. The red line is the fitted loess curve.
Figure 4: Residuals of the model for PFAS concentration vs Mud plotted against depth. The red line is the fitted loess curve.
Code
Area_wmodels %>%
  unnest(augment) %>% 
  ggplot(aes(year, pfas_value))+
  geom_point(aes(fill = Area), shape = 21, color = "black") +
  facet_wrap(~pfas, scale = "free")+
   scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  ylab("Concentration [µg/kg dry weight]")+
  xlab("Year")

Code
Area_wmodels %>%
  unnest(augment) %>% 
  filter(pfas == "Σ9PFAS") %>% 
  ggplot(aes(Area, year))+
  geom_jitter(aes(fill = Area), shape = 21, color = "black", width = 0.15) +
  geom_boxplot(alpha = 0.5, outliers = FALSE)+   scale_fill_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"))+
  ylab("Year")+
  xlab("Area")+
  theme(legend.position = "none")

Supplementary

Pair plot

Code
ggpairs(pfas_data_eq0_wide %>% 
          select(
Area, Mud, TOC, depth, Σ9PFAS, PFOS, PFOA, PFNA, PFUnDA, PFDA, PFHpDA
          ),
        progress = FALSE)

Regression table

Code
pfas_w_models %>%
  select(pfas, m_fit) %>%
  mutate(
    regression_table = map(m_fit, ~ model_parameters(.x, exponentiate = TRUE) %>% add_row(Parameter = "(Tweedie power)", Coefficient = glmmTMB:::.tweedie_power(.x)))
    # tweedie_power = map(m_fit, ~ glmmTMB:::.tweedie_power(.x))
  ) %>%
  unnest(regression_table) %>%
  mutate(
    Parameter = if_else(Component == "dispersion" & Parameter == "(Intercept)", "(Tweedie dispersion)", Parameter)
  ) %>%
  select(pfas, Parameter, Estimate = Coefficient, CI_low, CI_high, p) %>%
  gt() %>%
  fmt_number(columns = 4:6, decimals = 2) %>%
  fmt_number(columns = 3, decimals = 3) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(columns = everything())
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_row_groups(groups = everything())
  ) %>%
  tab_header(title = "Regression table", subtitle = "Model coefficients exponentiated, whereas tweedie parameters are not.")
Regression table
Model coefficients exponentiated, whereas tweedie parameters are not.
Parameter Estimate CI_low CI_high p
Σ9PFAS
(Intercept) 0.458 0.19 1.10 0.08
AreaII 0.880 0.51 1.51 0.65
AreaIII 0.948 0.51 1.76 0.86
AreaIV 0.734 0.43 1.26 0.26
AreaV 1.692 0.89 3.22 0.11
Mud 1.011 1.00 1.02 0.02
(Tweedie dispersion) 0.722 0.55 0.94 NA
(Tweedie power) 1.319 NA NA NA
PFOS
(Intercept) 0.082 0.03 0.21 0.00
AreaII 0.896 0.50 1.61 0.71
AreaIII 1.423 0.77 2.64 0.26
AreaIV 0.932 0.53 1.64 0.81
AreaV 2.404 1.32 4.37 0.00
Mud 1.015 1.01 1.02 0.00
(Tweedie dispersion) 0.247 0.19 0.32 NA
(Tweedie power) 1.103 NA NA NA
PFOA
(Intercept) 0.316 0.13 0.74 0.01
AreaII 0.863 0.51 1.46 0.58
AreaIII 0.667 0.35 1.27 0.22
AreaIV 0.617 0.35 1.07 0.09
AreaV 0.885 0.42 1.87 0.75
Mud 1.004 1.00 1.01 0.38
(Tweedie dispersion) 0.304 0.23 0.41 NA
(Tweedie power) 1.139 NA NA NA
PFNA
(Intercept) 0.108 0.04 0.27 0.00
AreaII 0.802 0.47 1.36 0.41
AreaIII 0.752 0.40 1.42 0.38
AreaIV 0.645 0.37 1.11 0.11
AreaV 1.087 0.56 2.11 0.81
Mud 1.012 1.00 1.02 0.01
(Tweedie dispersion) 0.195 0.14 0.27 NA
(Tweedie power) 1.098 NA NA NA
PFUnDA
(Intercept) 0.012 0.00 0.09 0.00
AreaII 0.527 0.12 2.29 0.39
AreaIII 1.956 0.56 6.90 0.30
AreaIV 0.965 0.28 3.31 0.95
AreaV 4.530 1.51 13.55 0.01
Mud 1.018 1.00 1.04 0.08
(Tweedie dispersion) 0.234 0.20 0.27 NA
(Tweedie power) 1.040 NA NA NA
Code
## other reg table implementation
tbl_merge(
  (pfas_w_models)$reg_table,
    tab_spanner =  pfas_w_models %>% pull(pfas) %>% as.character()
    ) %>% as_gt()
Characteristic Σ9PFAS PFOS PFOA PFNA PFUnDA
exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value exp(Beta) 95% CI1 p-value
Area














    I




    II 0.88 0.51, 1.51 0.6 0.90 0.50, 1.61 0.7 0.86 0.51, 1.46 0.6 0.80 0.47, 1.36 0.4 0.53 0.12, 2.29 0.4
    III 0.95 0.51, 1.76 0.9 1.42 0.77, 2.64 0.3 0.67 0.35, 1.27 0.2 0.75 0.40, 1.42 0.4 1.96 0.56, 6.90 0.3
    IV 0.73 0.43, 1.26 0.3 0.93 0.53, 1.64 0.8 0.62 0.35, 1.07 0.087 0.65 0.37, 1.11 0.11 0.96 0.28, 3.31 >0.9
    V 1.69 0.89, 3.22 0.11 2.40 1.32, 4.37 0.004 0.89 0.42, 1.87 0.7 1.09 0.56, 2.11 0.8 4.53 1.51, 13.6 0.007
Mud 1.01 1.00, 1.02 0.016 1.01 1.01, 1.02 0.002 1.00 1.00, 1.01 0.4 1.01 1.00, 1.02 0.013 1.02 1.00, 1.04 0.077
1 CI = Confidence Interval

Observations above LOQ

Code
pfas_data_eq0 %>%
  filter(pfas != "Σ9PFAS") %>%
  group_by(pfas) %>%
  mutate(max_loq = max(loq, na.rm = TRUE)) %>% #distinct(pfas, max_loq) %>%  View()
  summarise(
    n = sum(pfas_value > max_loq),
    n_samples = n_distinct(sample_id3)
  ) %>%
  mutate(
    p = n / n_samples
  ) %>% arrange(desc(p)) %>%
  select(-n_samples) %>%
  gt() %>%
  fmt_percent(columns = vars(p), decimals = 0) %>%
  # tab_header(
  #   title = md("Number of samples above LOQ"),
  #   subtitle = md("Number of samples above LOQ for each PFAS, and the proportion of samples above LOQ.")
  # ) %>%
  cols_label(
    pfas = md("**PFAS**"),
    n = md("**n > LOQ**"),
    p = md("**Proportion > LOQ**")
  )
Table 1: Number of observations above LOQ for each PFAS out of 95 sampled locations..
PFAS n > LOQ Proportion > LOQ
PFOA 73 77%
PFNA 70 74%
PFOS 70 74%
PFUnDA 23 24%
PFDA 22 23%
PFHpDA 19 20%
PFHxDA 7 7%
PFDoDA 1 1%
PFTrDA 1 1%
Table 2: Samples and fraction above the different LOQs for each PFAS.
Code
between_maxloqloq <- pfas_raw %>%
  group_by(pfas) %>%
  mutate(max_loq = max(loq, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(sample_id3) %>%
  mutate(
    pfas_value = if_else(
      pfas == "Σ9PFAS",
      sum(pfas_value[pfas != "Σ9PFAS"], na.rm = TRUE),
      pfas_value
    )
  ) %>%
  mutate(
    pfas = pfas_renamer(pfas)
  ) %>%
  mutate(pfas = factor(pfas, levels = c("PFHxDA","PFHpDA", "PFOA", "PFNA", "PFDA", "PFUnDA", "PFDoDA", "PFTrDA", "PFOS", "Σ9PFAS"))) %>% group_by(pfas) %>% 
mutate(
    n_above_loq =  sum(pfas_value >= loq, na.rm = FALSE),
    n_between_loqs = sum(pfas_value >= loq & pfas_value < max_loq, na.rm = FALSE),
    n_above_max_loq = sum(pfas_value >= max_loq, na.rm = FALSE),
    frac_between = 100*(n_between_loqs/(n_above_max_loq+n_between_loqs)),
    min_loq = min(loq, na.rm = FALSE),
    max_loq = max(loq, na.rm = FALSE),
  ) %>% 
  mutate(Area = factor(Area, levels = c("I","II", "III", "IV", "V")))

between_maxloqloq %>% distinct(pfas, .keep_all = TRUE) %>%
  select(
    pfas, n_above_loq, n_between_loqs, n_above_max_loq, frac_between, min_loq, max_loq) %>% ungroup() %>% 
gt() %>% 
  # color n_between_loqs by frac_between:
  data_color(
    columns = frac_between,
    fn = scales::col_numeric(
      palette = "BuPu",
      domain = c(0, 50)
    )
  ) %>% 
  fmt_number(columns = frac_between,
             decimals = 0) %>% 
  cols_label(
    pfas = "PFAS",
    n_above_loq = "> LOQ",
    n_between_loqs = "> LOQ & < maxLOQ",
    n_above_max_loq = "> maxLOQ",
    frac_between = "% between",
    min_loq = "min LOQ",
    max_loq = "max LOQ"
  ) %>% gtsave("tableDL.html")
Code
between_maxloqloq %>% 
  filter(str_detect(pfas, "9PFAS", negate = TRUE)) %>%
  ggplot(aes(Area, pfas_value, color = frac_between))+
  geom_jitter(data = . %>% filter(
    pfas_value > loq & pfas_value < max_loq
  ), width = 0.2)+
  geom_jitter(data = . %>% filter(
    pfas_value > max_loq | pfas_value < loq
  ), width = 0.2, color = "black", alpha = 0.5)+
  geom_point(aes(Area, loq), size = 5, color = "red", shape = "_")+
  geom_hline(aes(yintercept = max_loq), color = "red", linetype = "dotted") + 
  facet_wrap(~pfas, scales = "free")+
  scale_color_gradient(low = "white", high = "purple", limits = c(0, 50))+
    scale_x_discrete(limits = c("I", "II", "III", "IV", "V"))

Code
between_maxloqloq %>% 
  filter(str_detect(pfas, "9PFAS", negate = TRUE)) %>%
  ggplot(aes(latitude, loq, color = Area))+geom_point()+
  scale_color_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c")) +
  facet_wrap(~pfas)

TOC vs Mud

Code
model_TOCmid <-  
  glmmTMB(data = pfas_data_eq0_wide, formula = TOC ~ Mud + Area, family = Gamma(link = "log"))

# model_TOCmid %>% 
model_TOCmid %>% parameters(exponentiate = TRUE) %>% select(Parameter, "exp(Estimate)" = Coefficient, p) %>% gt() %>% fmt_number(columns = "exp(Estimate)", decimals = 3) %>% fmt_scientific(columns = p,n_sigfig = 1) %>% 
    tab_header(title = "Regression table", subtitle = "Using a GLM with formula TOC ~ Mud + Area with log link and Gamma distributed errors.")
Regression table
Using a GLM with formula TOC ~ Mud + Area with log link and Gamma distributed errors.
Parameter exp(Estimate) p
(Intercept) 0.185 3 × 10−17
Mud 1.023 8 × 10−30
AreaII 1.584 6 × 10−4
AreaIII 1.415 3 × 10−2
AreaIV 0.973 8 × 10−1
AreaV 1.755 5 × 10−3
(Intercept) 0.429 NA
Code
ggplot(pfas_data_eq0_wide, aes(Mud, TOC)) +
  geom_point(aes(color = Area)) +
  geom_smooth(method = "lm", color = "#AF2D2D") +
  scale_color_manual(values = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c")) +
  xlab("Mud [%]") +
  ylab("TOC")

PFAS vs TOC

Code
pfas_glm_toc <- function(in_dat){
  glmmTMB(data = in_dat, formula = pfas_value ~ Area + TOC, family = tweedie(link = "log"))
}

pfas_w_models_toc <- pfas_data_eq0 %>%
  filter(!(pfas %in% c("PFHxDA", "PFDoDA", "PFTrDA"))) %>%
  ## inspect effect of excluding ares with high PFAS concentrations and high Mud content
  # filter(!(Area %in% c("I", "V"))) %>%
  mutate(
    # pfas = fct_relevel(pfas, c("Σ9PFAS", "PFOS", "PFOA", "PFNA", "PFUnDA")),
    pfas_label = pfas
  ) %>%
  arrange(pfas) %>%
  group_by(pfas) %>%
  nest() %>%
  mutate(
    m_fit = map(data, pfas_glm_toc),
    tidy = map(m_fit, ~ broom.mixed::tidy(.x, exponentiate = TRUE)),
    glance = map(m_fit, broom.mixed::glance),
    augment = map2(m_fit, data, ~ broom.mixed::augment(.x, .y)),
    pred = map(m_fit, ~ ggpredict(.x, terms = c("TOC [0:6]", "Area"))),
    simulated = map(m_fit, ~ simulate(.x, nsim = 25, seed = 12345)),
    sim_data = map2(
      simulated, data, ~ bind_cols(.x, .y) %>%
        pivot_longer(cols = matches("sim"), names_to = "simgroup", values_to = "sim") %>%
        rename(group = Area)
    ),
    Area_model = map2(m_fit, pfas, ~ glht(.x, linfct = mcp(Area = "Tukey"))),
    # # validate the parameters:: implementation
    # Area_hypothesis = map2(m_fit, pfas, ~ glht(.x, linfct = mcp(Area = "Tukey")) %>%
    #   summary(test = adjusted(type = "single-step")) %>%
    #   tidy() %>%
    #   select(-c(term, null.value)) %>%
    #   mutate(estimate = exp(estimate))),
    reg_table = map(m_fit, ~ tbl_regression(.x, exponentiate = TRUE, tidy_fun = broom.mixed::tidy), estimate_fun = purrr::partial(style_ratio, digits = 3))
  )

pfas_w_models_toc %>% select(pfas, tidy) %>% unnest(tidy) %>% filter(term == "TOC") %>% select(pfas, term, estimate, p.value)
# A tibble: 7 × 4
# Groups:   pfas [7]
  pfas   term  estimate p.value
  <fct>  <chr>    <dbl>   <dbl>
1 PFHpDA TOC      0.834  0.525 
2 PFOA   TOC      0.854  0.168 
3 PFNA   TOC      0.996  0.970 
4 PFDA   TOC      1.31   0.130 
5 PFUnDA TOC      1.37   0.0400
6 PFOS   TOC      1.18   0.0588
7 Σ9PFAS TOC      1.08   0.465 
Code
sim_data_toc <- pfas_w_models_toc %>%
  unnest(sim_data)

pred_data_toc <- pfas_w_models_toc %>% unnest(pred)

pred_data_toc %>% ggplot(aes(x, predicted)) +
  geom_smooth(color = "#AF2D2D", se = FALSE) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  geom_point(
    data = sim_data_toc,
    aes(x = TOC, y = sim),
    alpha = 0.1,
    size = 0.2,
    color = muted("red", l = 50, c = 70)
  ) +
  geom_point(aes(x = TOC, y = pfas_value),
             data = sim_data %>% distinct(sample_id3, .keep_all = TRUE)) +
  facet_grid(pfas ~ group, scales = "free") +
  xlab("TOC [%]") +
  ylab("PFAS [\u03BCg/kg]")

Concentrations table

Code
pfas_data_0_wide %>% select(-sample_id3) %>% gt(groupname_col = "Area",rowname_col = "year") %>%
  fmt_number(columns = 2:6, decimals = 2) %>%
  fmt_number(columns = depth, decimals = 0) %>%
  fmt_number(columns = Mud, decimals = 0) %>%
  cols_label(
    longitude = md("Longitude"),
    latitude = md("Latitude")
) %>% tab_spanner(label = "Concentration [µg/kg dry weight]", columns = "PFOS":"Σ9PFAS") %>% 
  sub_values(
  columns = everything(),
  rows = everything(),
  values = "0.00",
  replacement = "-"
) #%>% 
Longitude Latitude Mud TOC depth Concentration [µg/kg dry weight]
PFOS PFHxDA PFHpDA PFOA PFNA PFDA PFUnDA PFDoDA PFTrDA Σ9PFAS
I
2016 28.33 78.15 90 1.28 307 0.21 - - 0.47 0.13 - - - - 0.80
2017 11.25 77.69 95 1.88 303 0.14 - - 0.36 0.21 - - - - 0.71
2017 14.58 76.52 68 1.42 277 0.08* - - 0.16 0.13 - - - - 0.36
2018 11.71 78.98 97 1.42 311 0.18 - - 0.29 0.36 0.08* - - - 0.91
2018 11.65 79.10 90 1.08 275 0.16 - - 0.09* 0.09* - - - - 0.33
2018 11.42 79.02 98 1.79 346 0.12 - - 0.05* - - - - - 0.16
2018 22.25 80.04 94 1.08 145 0.25 - 0.11 0.79 0.53 - - - - 1.68
2018 22.15 80.09 98 1.33 213 1.00 - 0.16 1.14 0.81 0.10 0.23 - - 3.44
2018 22.13 80.17 96 1.23 208 0.97 - 0.05* 1.22 0.87 0.16 0.19 - - 3.46
2019 31.68 79.82 95 2.09 207 0.17 - - 0.22 0.23 - 0.09* - - 0.70
2019 10.74 79.04 96 2.15 321 0.12 - - 0.08* 0.10 - 0.07* - - 0.36
2019 12.26 78.91 96 0.35 82 - - - - - - - - - -
2019 6.16 79.10 95 1.52 1,221 0.07* - - 0.08* - - 0.08* - - 0.23
2022 26.95 81.41 96 1.58 866 0.41 - 0.16 0.66 0.33 0.10 0.25 - - 1.92
2022 22.17 81.50 84 1.31 855 0.20 - 0.12 0.47 0.21 0.06* 0.18 - - 1.24
2022 16.11 80.40 97 2.07 416 0.37 - 0.13 0.50 0.63 0.13 - - - 1.76
2022 8.96 79.66 18 0.45 419 0.14 - 0.08* 0.33 0.20 - 0.11* - - 0.86
2022 28.95 80.50 98 1.71 374 1.06 - 0.29 1.41 0.89 0.18 0.28 - - 4.10
II
2009 15.50 72.42 29 0.45 611 0.12 - - 0.21 0.10 - 0.11* - - 0.53
2009 14.60 72.27 48 0.54 1,037 - - - - - - - - - -
2009 15.67 72.28 22 0.27 728 - - - 0.10 - - - - - 0.10
2009 16.55 72.15 35 0.49 385 0.08* 0.02* 0.03* 0.28 0.07* 0.01* - - - 0.49
2009 16.91 72.04 41 0.74 343 0.64 0.11 0.11 0.67 0.33 0.15 0.24 - - 2.25
2009 16.75 71.86 52 0.82 357 0.87 0.12 0.12 0.90 0.41 0.15 0.35 - - 2.91
2009 15.75 71.71 26 0.25 778 - - - - - - - - - -
2010 25.79 72.34 35 0.41 250 0.15 - - 0.39 0.15 - - - - 0.69
2010 25.84 72.10 48 0.47 236 0.37 - - 0.57 0.25 - - - - 1.19
2015 34.30 73.21 60 1.17 207 0.40 - - 0.68 0.40 0.09 0.11* - - 1.69
2015 36.26 73.35 57 1.52 230 0.19 - - 0.45 0.26 - - - - 0.89
2015 36.58 73.60 68 2.32 258 - - - 0.08* - - - - - 0.08
2016 27.37 71.00 81 1.65 134 0.09 - - 0.19 0.13 - - - - 0.41
2016 26.42 76.02 84 2.35 182 0.27 - - 0.34 0.29 - 0.13* - - 1.02
2016 25.60 75.00 87 2.90 208 0.27 - - 0.33 0.25 - - - - 0.86
2016 25.53 74.39 73 1.92 297 0.61 0.12 0.19 1.28 0.52 0.11 0.12* - - 2.95
2016 26.11 74.15 78 1.69 410 0.26 0.13 0.16 1.16 0.53 - 0.12* - - 2.36
2017 21.49 74.30 89 3.14 203 0.18 - - 0.31 0.22 - - - - 0.71
2017 33.78 73.97 85 2.10 336 - - - 0.24 0.13 - - - - 0.37
2017 36.10 74.68 96 3.60 265 - - - 0.08* 0.10 - - - - 0.18
2017 36.68 74.89 87 2.70 204 - - - - - - - - - -
2019 18.57 74.79 74 1.79 246 0.12 - - 0.11 0.11 - - - - 0.33
2019 17.64 74.81 82 1.82 301 0.19 - - 0.29 0.27 0.07* 0.06* - - 0.88
2019 14.61 74.93 91 1.01 1,483 0.06* - - 0.06* - - 0.08* - - 0.21
2021 27.16 75.12 90 2.69 260 0.26 0.05* 0.15 0.62 0.45 0.08* 0.12* - - 1.74
2021 25.56 75.55 75 2.03 170 0.17 - - 0.18 0.14 - 0.06* - - 0.55
2021 21.86 74.19 86 2.38 304 0.16 - - 0.20 0.17 - - - - 0.54
III
2007 18.02 70.16 26 0.80 324 0.22 - - 0.20 - - - - - 0.42
2007 17.73 70.19 13 0.28 252 - - - 0.12 - - - - - 0.12
2007 18.15 70.14 22 0.66 363 0.34 - - 0.33 0.18 - - - - 0.85
2007 17.13 69.85 24 0.36 309 - - - 0.10 - - - - - 0.10
2008 16.33 69.26 30 1.06 482 0.35 - - 0.43 0.32 - 0.19 - - 1.30
2010 17.03 70.39 18 0.44 1,072 - - - - - - - - - -
2010 16.91 70.90 14 0.30 937 - - - - - - - - - -
2010 19.57 70.48 16 0.60 304 - - - - - - - - - -
2010 20.83 70.77 41 0.45 247 0.20 - - 0.28 0.14 - - - - 0.62
2010 21.05 70.70 29 0.54 259 0.09 - - 0.16 0.10 - - - - 0.35
2010 20.84 70.68 40 0.61 203 0.24 - - 0.29 0.12 - - - - 0.65
2010 20.24 70.77 40 0.46 213 0.29 - - 0.46 0.18 - - - - 0.93
2011 27.76 71.45 80 0.86 403 - - - 0.09* 0.03* 0.01* - - - 0.13
2014 30.92 69.90 97 1.78 314 0.23 - - 0.36 0.21 - - - - 0.80
2014 29.91 70.86 64 0.84 378 0.46 - - 0.61 0.42 0.10 0.19 - - 1.78
2014 29.66 71.06 32 0.41 339 0.20 - - 0.30 0.27 - - - - 0.77
2014 29.20 71.32 59 0.76 362 0.54 0.13 0.11 0.77 0.38 0.12 0.24 - - 2.28
2021 21.13 69.99 94 1.88 282 0.67 - - 0.28 0.29 0.17 0.27 - - 1.67
2021 20.76 69.85 94 1.97 106 0.56 - - 0.21 0.22 0.15 0.20 - - 1.33
2021 21.33 70.06 94 2.17 345 0.63 - - 0.26 0.25 0.14 0.31 - - 1.58
2021 21.73 70.04 97 2.48 345 0.49 - - 0.22 0.16 0.09 0.12* - - 1.08
IV
2011 11.16 67.80 56 0.54 264 - - - 0.30 0.10 - - - - 0.40
2011 8.99 67.79 63 0.58 855 - - - - - - - - - -
2012 10.08 68.35 71 0.74 1,963 - - - 0.01* - - - - - 0.01
2013 4.69 63.03 80 1.36 768 - - - 0.31 0.44 0.09 0.15 - - 0.99
2014 5.57 63.59 87 1.46 767 - - - - - - - - - -
2015 8.24 66.56 61 0.50 330 0.05* 0.07* 0.06* 0.42 0.12 - - - - 0.73
2020 7.57 63.88 36 0.32 236 0.18 - 0.07* 0.35 0.22 0.06* 0.11* - - 0.98
2020 8.41 64.28 88 0.90 357 0.48 - 0.11 0.47 0.36 0.11 0.17 - - 1.70
2020 8.22 64.71 82 0.63 238 0.36 - 0.06* 0.38 0.21 0.06* - - - 1.06
2020 10.43 66.83 88 0.74 400 0.46 - 0.11 0.46 0.34 0.08* - - - 1.45
2020 10.48 66.23 73 0.58 294 0.27 - 0.07* 0.41 0.19 0.06* - - - 1.00
2020 10.18 65.68 90 0.71 396 0.35 - 0.07* 0.56 0.24 - 0.06* - - 1.29
2020 9.08 65.72 93 0.75 449 0.32 - 0.11 0.46 0.37 0.10 - - - 1.36
2021 6.08 62.50 95 2.15 73 0.28 - - 0.06* 0.06* - 0.08* - - 0.47
2021 6.18 62.58 88 4.50 180 0.73 - - 0.07* 0.12 0.14 0.23 - - 1.29
2021 6.54 62.73 87 4.52 104 0.33 - - 0.10 0.09 0.06* 0.07* - - 0.65
2021 6.39 62.67 85 1.29 90 0.25 - - 0.05* - - 0.07* - - 0.37
2021 6.49 65.36 80 0.64 419 0.32 - 0.17 0.49 0.42 0.10 0.17 - - 1.66
2021 2.32 62.84 70 0.58 780 0.04* - - 0.08* - - 0.08* - - 0.19
2021 4.89 64.61 95 1.20 1,039 0.25 - 0.06* 0.18 0.09 0.09* 0.21 - - 0.87
2021 9.34 64.52 80 0.63 294 0.37 0.13 0.16 0.57 0.28 0.08* 0.16 - - 1.74
2021 10.82 65.68 50 0.43 384 0.24 - 0.09* 0.35 0.19 - 0.12* - - 0.98
2021 12.49 66.83 44 0.49 240 0.20 - 0.07* 0.21 0.17 - 0.10* - - 0.76
V
2020 6.04 59.23 97 2.33 348 1.24 0.06* 0.06* 0.53 0.50 0.25 0.35 - - 3.00
2020 5.82 59.30 97 2.19 702 1.68 0.13 0.16 0.82 0.79 0.38 0.44 - - 4.40
2020 5.54 59.09 88 2.09 283 0.40 - - 0.19 0.20 0.08* 0.16 - - 1.03
2020 5.77 58.94 84 5.50 240 0.64 - - 0.17 0.05* 0.12 0.67 0.29 0.38 2.32
2022 10.25 59.03 97 1.90 166 0.28 - - 0.13 0.19 - 0.15 - - 0.75
2022 4.70 58.81 96 1.57 277 0.33 - 0.17 0.56 0.43 0.07* 0.12* - - 1.67
Code
  #gtsave("si_conc_table.docx")

Concentrations and Mud % for each PFAS by area and year. The highest LOQ in any sample for each PFAS is used, values below LOQ indicated with a dash.

Other

Tweedie distribution

Code
#simulate 1000 tweedie using mu= 1 and mu = 3:
mutibble <- rbind(tibble(mu = c(rep(1, 10000))), tibble(mu = c(rep(2, 10000))), tibble(mu = c(rep(3, 10000)))) %>% 
  mutate(
  )
simtibble <- mutibble %>%
  mutate(
    sim = rTweedie(mu = mu, p = 1.404, 0.666),
    mu = str_replace(mu, "\\d", paste("\u03BC = ", mu))
  )
simtibble %>% ggplot(aes(x = sim)) +
  geom_histogram(binwidth = 0.05, color = "black") +
  facet_wrap(~mu)+
  geom_vline(aes(xintercept = str_extract(mu, "\\d") %>% as.numeric()), linetype = "dotted", color = "red")+
  xlab("Response value")+
  ylab("Frequency [#]")
Figure 5: Simulated Tweedie distributions for μ = 1, 2 and 3, illustrating property to model censored data.

Chemical compounds table

Code
pfas_table <- tibble(
  pfas = c(
    "PFHxA",
    "PFHpA",
    "PFOA",
    "PFNA",
    "PFDA",
    "PFUnDA",
    "PFDoDA",
    "PFTrA",
    "PFOS"
  ),
  pfas_names = c(
    "Perfluorohexanoic acid",
    "Perfluoroheptanoic acid",
    "Perfluorooctanoic acid",
    "Perfluorononanoic acid",
    "Perfluorodecanoic acid",
    "Perfluoroundecanoic acid",
    "Perfluorododecanoic acid",
    "Perfluorotridecanoic acid",
    "Perfluorooctanesulfonic acid"
  )
) %>% 
  mutate(
    pc_data = map(pfas_names, ~ pcprop_from_name(.x, property = "all"))
  ) %>%
  unnest(pc_data)
Code
pfas_table %>% 
  mutate(structure_depiction = CanonicalSMILES) %>% 
  select(pfas, pfas_names, CID, logP = XLogP, structure_depiction) %>% 
  gt() %>% 
  text_transform(
    locations = cells_body(columns = structure_depiction),
    fn = function(x) {
      local_image(
        filename = paste0(getwd(), "/figures/", x, ".svg"),
        height = 60
      )
    }
  ) %>% gtsave("test2.html")
# # rsvg convert
# library(rsvg)
# list.files("figures/", full.names = TRUE) %>% str_subset(".svg") %>% map(~ rsvg_pdf(.x, paste0(.x %>% str_replace(".svg", ".pdf"))))

LogP vs mud coefficient

Code
pfas_w_models %>%
  select(pfas, m_fit) %>%
  mutate(regression_table = map(
    m_fit,
    ~ model_parameters(.x, exponentiate = TRUE) %>% add_row(
      Parameter = "(Tweedie power)",
      Coefficient = glmmTMB:::.tweedie_power(.x)
    )
  )) %>%
  unnest(regression_table) %>%
  mutate(
    Parameter = if_else(
      Component == "dispersion" &
        Parameter == "(Intercept)",
      "(Tweedie dispersion)",
      Parameter
    )
  ) %>%
  select(pfas, Parameter, Estimate = Coefficient, CI_low, CI_high, p) %>% left_join(pfas_table, by = "pfas") %>%
  filter(Parameter == "Mud") %>%
  ggplot(aes(x = XLogP, y = Estimate)) + geom_point() +
  #add geom_text of the pfas for each point:
  geom_text_repel(aes(label = pfas), ) +
  geom_errorbar(aes(ymin = CI_low, ymax = CI_high)) +
  scale_y_continuous(breaks = c(1, 1.01, 1.02, 1.03, 1.04, 1.05))